library(xgboost)
library(randomForest)
library(tidyverse)
library(lubridate)

source('functions.r')
load("Table_construction.Rdata")

Violent Recidivism

Fit age polynomial

 # Sample used to fit the age polynomial: among rows whose decile scores are
 # not -1 and whose age lies in (18, 70], keep for each age the row(s) with the
 # lowest violence raw score.
 age_subset = features %>%
  filter(`Risk of Recidivism_decile_score` != -1, `Risk of Violence_decile_score` != -1) %>% # Filter 1
  select(p_current_age, p_recid_raw = `Risk of Violence_raw_score`) %>%
  filter(p_current_age > 18 & p_current_age <= 70) %>% # Filter 5
  group_by(p_current_age) %>%
  arrange(p_recid_raw, .by_group=TRUE) %>%
  top_n(n=-1, wt=p_recid_raw) %>% # Filter 6: lowest raw score per age (ties are kept)
  ungroup() # drop the per-age grouping so it does not leak into later uses of age_subset

# Quartic polynomial in age fitted to the per-age minimum raw scores held in
# `age_subset`. The term order is kept as written because it determines the
# order of the reported coefficients.
mdl_age <- lm(
  formula = p_recid_raw ~
    I(p_current_age^4) +
    I(p_current_age^3) +
    I(p_current_age^2) +
    p_current_age,
  data = age_subset
)

# Coefficient table and goodness of fit.
summary(mdl_age)
## 
## Call:
## lm(formula = p_recid_raw ~ I(p_current_age^4) + I(p_current_age^3) + 
##     I(p_current_age^2) + p_current_age, data = age_subset)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57815 -0.00957  0.00722  0.03974  0.13353 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.554e+00  8.206e-01   1.894 0.061779 .  
## I(p_current_age^4)  4.078e-07  2.904e-07   1.404 0.164074    
## I(p_current_age^3) -8.818e-05  5.109e-05  -1.726 0.088148 .  
## I(p_current_age^2)  7.437e-03  3.227e-03   2.304 0.023731 *  
## p_current_age      -3.161e-01  8.631e-02  -3.662 0.000441 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09085 on 82 degrees of freedom
## Multiple R-squared:  0.9799, Adjusted R-squared:  0.979 
## F-statistic:  1001 on 4 and 82 DF,  p-value: < 2.2e-16
# Report the fitted age-polynomial coefficients with 20 decimal digits.
print("Coefficients:")
## [1] "Coefficients:"
sprintf("%.20e", coef(mdl_age)) # More precision for paper
## [1] "1.55398886995651119847e+00"  "4.07808994844963593304e-07" 
## [3] "-8.81767707692974442345e-05" "7.43693932415256248047e-03" 
## [5] "-3.16106532728205047444e-01"

Shared setup (applies to all models)

### Add useful columns to features and apply row filters used for all models
# Joins race/sex/name (unnested from the `people` list-column of data_before),
# features_on and outcomes onto features, keyed on person_id + screening_date.
# Then adds the violence raw score, the fitted age polynomial and their
# difference (the "remainder" that the models below predict).
features_filt = features %>%
  inner_join(
    data_before %>% 
      select(person_id, screening_date, people) %>%
      unnest(people) %>% # name the list-column explicitly; bare unnest() is deprecated in tidyr >= 1.0
      select(person_id, screening_date, race, sex, name),
    by = c("person_id","screening_date")
  ) %>%
  inner_join(features_on, by = c("person_id","screening_date")) %>%
  inner_join(outcomes, by = c("person_id","screening_date")) %>%
  filter(`Risk of Recidivism_decile_score` != -1, `Risk of Violence_decile_score` != -1) %>% # Filter 1
  filter(!is.na(current_offense_date)) %>% # Filter 3
  mutate(p_recid_raw = `Risk of Violence_raw_score`,
         # NOTE(review): no age filter is applied here, so the polynomial is also
         # evaluated for ages outside the (18, 70] range it was fitted on.
         age_poly = predict(mdl_age, newdata=data.frame(p_current_age=p_current_age)),
         p_recid_raw_noage = p_recid_raw - age_poly)

## Set parameters (each combination will be run)
# Vector-valued entries are the candidate values handed to fit_xgboost().
# NOTE(review): newer xgboost releases document "reg:linear" as a deprecated
# alias of "reg:squarederror" -- confirm against the installed version.
param <- list(
  objective        = "reg:linear",
  eval_metric      = "rmse",
  eta              = c(0.05, 0.1),
  gamma            = c(0.5, 1),
  max_depth        = c(2, 5),
  min_child_weight = c(5, 10),
  subsample        = 1,
  colsample_bytree = 1
)

# svm
# Candidate values handed to fit_svm().
param_svm <- list(
  type        = "eps-regression",
  cost        = c(0.5, 1, 2),
  epsilon     = c(0.5, 1, 1.5),
  gamma_scale = c(0.5, 1, 2)
)

# One row per model group, one column per model type; cells start as NA and
# are filled in as each model is evaluated.
res_rmse <- data.frame(
  Group = 1:4,
  lm = NA,
  xgb = NA,
  rf = NA,
  svm = NA
)
## Age polynomial
# Age window shown on the x axis and the grid on which the fitted curve is drawn.
xmin <- 18
xmax <- 70
xx <- seq(xmin, xmax, length.out = 1000)

# Raw violence scores (points) with the fitted age polynomial on top (line).
ggplot() +
  geom_point(
    aes(x = p_current_age, y = p_recid_raw, color = "b"),
    alpha = 0.3,
    data = features_filt
  ) +
  geom_line(
    aes(
      x = xx,
      y = predict(mdl_age, newdata = data.frame(p_current_age = xx)),
      color = "a"
    )
  ) +
  theme_bw() +
  xlim(xmin, xmax) +
  xlab("Age at COMPAS screening date") +
  ylab("COMPAS violence raw") +
  theme(
    text = element_text(size = 12),
    axis.text = element_text(size = 12),
    legend.position = "none"
  )
## Warning: Removed 19 rows containing missing values (geom_point).

# No plot is passed, so ggsave() falls back to its default plot argument.
ggsave(
  filename = "Figures/age_agePoly_violent.pdf",
  width = 3.5, height = 2.5, units = "in"
)
## Warning: Removed 19 rows containing missing values (geom_point).
### Number of priors vs. COMPAS remainder
# Scatter of the age-adjusted raw score against the prior violent charge count.
ggplot(features_filt) +
  geom_point(aes(x = p_charge_violent, y = p_recid_raw_noage), alpha = 0.3) +
  theme_bw() +
  xlab("Number of prior violent charges") +
  ylab("COMPAS violence raw remainder") +
  theme(
    text = element_text(size = 12),
    axis.text = element_text(size = 12)
  )

# No plot is passed, so ggsave() falls back to its default plot argument.
ggsave(
  filename = "Figures/priors_rawScoreRemain_violent.pdf",
  width = 3.5, height = 3.5, units = "in"
)
## Total violence history items
# Scatter of the age-adjusted raw score against the sum of eight
# violence-history counts; the x axis is limited to [0, 25].
ggplot(features_filt) +
  geom_point(
    aes(
      x = p_juv_fel_count +
        p_felprop_violarrest +
        p_murder_arrest +
        p_felassault_arrest +
        p_misdemassault_arrest +
        p_famviol_arrest +
        p_sex_arrest +
        p_weapons_arrest,
      y = p_recid_raw_noage
    ),
    alpha = 0.3
  ) +
  xlab("Total Violence History items") +
  ylab("COMPAS violence raw remainder") +
  xlim(0, 25) +
  theme_bw() +
  theme(
    text = element_text(size = 12),
    axis.text = element_text(size = 12)
  )
## Warning: Removed 3 rows containing missing values (geom_point).

# No plot is passed, so ggsave() falls back to its default plot argument.
ggsave(
  filename = "Figures/violenceHist_rawScoreRemain_violent.pdf",
  width = 3.5, height = 3.5, units = "in"
)
## Warning: Removed 3 rows containing missing values (geom_point).

Replicating ProPublica logistic regression

# Data for the ProPublica-style regression: rows whose current offense date is
# on or before current_offense_date_limit, plus 0/1 indicator covariates.
# NOTE(review): n_priors is created here but is not a term in the glm formula
# that follows in this section.
propub <- features_filt %>%
  filter(current_offense_date <= current_offense_date_limit) %>% # Only people with valid recidivism values
  mutate(
    age_low = if_else(p_current_age < 25, true = 1, false = 0),
    age_high = if_else(p_current_age > 45, true = 1, false = 0),
    female = if_else(sex == "Female", true = 1, false = 0),
    n_priors = p_felony_count_person + p_misdem_count_person,
    # Medium and High risk scores get +1 label
    compas_high = if_else(`Risk of Violence_decile_score` >= 5, true = 1, false = 0),
    # Base level is Caucasian, as in ProPublica analysis
    race = relevel(factor(race), ref = "Caucasian")
  )
# Logistic regression of the binary COMPAS violence label on the covariates
# prepared in `propub`. Term order is kept because it fixes the coefficient
# table layout.
mdl_glm <- glm(
  formula = compas_high ~
    female +
    age_high +
    age_low +
    as.factor(race) +
    p_charge +
    is_misdem +
    recid_violent,
  family = binomial(link = "logit"),
  data = propub
)

summary(mdl_glm)
## 
## Call:
## glm(formula = compas_high ~ female + age_high + age_low + as.factor(race) + 
##     p_charge + is_misdem + recid_violent, family = binomial(link = "logit"), 
##     data = propub)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6118  -0.6001  -0.3213   0.7395   2.9655  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -2.368882   0.094675 -25.021  < 2e-16 ***
## female                          -0.651197   0.099532  -6.543 6.05e-11 ***
## age_high                        -1.727458   0.174615  -9.893  < 2e-16 ***
## age_low                          2.593419   0.079247  32.726  < 2e-16 ***
## as.factor(race)African-American  0.745976   0.083088   8.978  < 2e-16 ***
## as.factor(race)Asian            -0.765381   0.678659  -1.128    0.259    
## as.factor(race)Hispanic          0.005838   0.146006   0.040    0.968    
## as.factor(race)Native American   0.503070   0.740727   0.679    0.497    
## as.factor(race)Other            -0.039456   0.170241  -0.232    0.817    
## p_charge                         0.074006   0.004375  16.915  < 2e-16 ***
## is_misdem                       -0.362223   0.077149  -4.695 2.67e-06 ***
## recid_violent                    0.704532   0.093094   7.568 3.79e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7378.0  on 5758  degrees of freedom
## Residual deviance: 4934.2  on 5747  degrees of freedom
## AIC: 4958.2
## 
## Number of Fisher Scoring iterations: 6

Group 1 models: predicting (raw - age polynomial) without using age variables or race

### Create group 1 training data

## Select features and cap each count feature at a fixed maximum
train <- features_filt %>%
  transmute(
    # p_current_age is left out of this group
    p_age_first_offense,
    p_juv_fel_count        = pmin(p_juv_fel_count, 2),
    p_felprop_violarrest   = pmin(p_felprop_violarrest, 5),
    p_murder_arrest        = pmin(p_murder_arrest, 3),
    p_felassault_arrest    = pmin(p_felassault_arrest, 3),
    p_misdemassault_arrest = pmin(p_misdemassault_arrest, 3),
    # p_famviol_arrest (capped at 3) is left out as well
    p_sex_arrest           = pmin(p_sex_arrest, 3),
    p_weapons_arrest       = pmin(p_weapons_arrest, 3),
    p_n_on_probation       = pmin(p_n_on_probation, 5),
    p_current_on_probation = pmin(p_current_on_probation, 5),
    p_prob_revoke          = pmin(p_prob_revoke, 5),
    p_recid_raw_noage # regression target
  )

## Format for xgboost
# Feature matrix is every column except the target; the label is the target.
train_xgb <- xgb.DMatrix(
  data = as.matrix(select(train, -p_recid_raw_noage)),
  label = as.matrix(select(train, p_recid_raw_noage))
)

Model 1: Linear model

# Ordinary least squares on every feature column in `train`.
mdl_lm <- lm(formula = p_recid_raw_noage ~ ., data = train)
summary(mdl_lm)
## 
## Call:
## lm(formula = p_recid_raw_noage ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9576 -0.3445 -0.1178  0.2532  3.7943 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.8451607  0.0153243  55.152  < 2e-16 ***
## p_age_first_offense    -0.0065319  0.0004888 -13.362  < 2e-16 ***
## p_juv_fel_count         0.1306356  0.0196060   6.663 2.84e-11 ***
## p_felprop_violarrest    0.1124162  0.0061483  18.284  < 2e-16 ***
## p_murder_arrest         0.1027832  0.0481811   2.133   0.0329 *  
## p_felassault_arrest     0.1810567  0.0114713  15.783  < 2e-16 ***
## p_misdemassault_arrest  0.1669203  0.0107051  15.593  < 2e-16 ***
## p_sex_arrest            0.1747182  0.0407972   4.283 1.87e-05 ***
## p_weapons_arrest        0.1033170  0.0158291   6.527 7.07e-11 ***
## p_n_on_probation        0.0787288  0.0051618  15.252  < 2e-16 ***
## p_current_on_probation  0.1522874  0.0260538   5.845 5.24e-09 ***
## p_prob_revoke           0.1443481  0.0195092   7.399 1.50e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5048 on 9030 degrees of freedom
## Multiple R-squared:   0.32,  Adjusted R-squared:  0.3191 
## F-statistic: 386.3 on 11 and 9030 DF,  p-value: < 2.2e-16
# Training-set RMSE of the linear model, stored in the Group 1 row.
res_rmse$lm[res_rmse$Group == 1] <- rmse(predict(mdl_lm, newdata = train), train$p_recid_raw_noage) # ADJUST GROUP

Model 2: xgboost

# Fix the RNG seed, then fit over the parameter combinations in `param`.
set.seed(46)
mdl_xgb <- fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  15          
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "1"         
## max_depth        "5"         
## min_child_weight "10"        
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
# Predictions on the training matrix against the COMPAS remainder.
pred <- predict(mdl_xgb, newdata = train_xgb)
actual <- train$p_recid_raw_noage

res_rmse$xgb[res_rmse$Group == 1] <- rmse(pred, actual) # ADJUST GROUP

# Both axes share one range so the identity line is meaningful.
axis_min <- min(pred, actual)
axis_max <- max(pred, actual)

ggplot(data.frame(xgboost = pred, compas = actual)) +
  geom_point(aes(x = compas, y = xgboost), alpha = 0.3) +
  geom_abline(slope = 1, color = "red") +
  xlim(axis_min, axis_max) +
  ylim(axis_min, axis_max) +
  coord_fixed() +
  theme_bw() +
  xlab("COMPAS violence raw remainder") +
  ylab("XGBoost prediction") +
  theme(
    text = element_text(size = 14),
    axis.text = element_text(size = 14)
  )

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

# Fix the RNG seed, then fit a random forest with the package defaults.
set.seed(55656)

mdl_rf <- randomForest(formula = p_recid_raw_noage ~ ., data = train)

# NOTE(review): the randomForest docs describe `$predicted` as out-of-bag
# predictions, whereas the other models are scored on their training fit --
# confirm the RMSE columns are meant to be compared directly.
res_rmse$rf[res_rmse$Group == 1] <- rmse(mdl_rf$predicted, train$p_recid_raw_noage) # ADJUST GROUP

Model 4: SVM

# Fit the SVM over the parameter combinations in `param_svm`.
mdl_svm <- fit_svm(p_recid_raw_noage ~ ., train, param_svm)
## Training on 27 sets of parameters.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
## [1] "Best parameters:"
##             19              
## type        "eps-regression"
## cost        "0.5"           
## epsilon     "0.5"           
## gamma_scale "2"             
## gamma       "0.1666667"
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
# RMSE of the SVM's fitted values, stored in the Group 1 row.
res_rmse$svm[res_rmse$Group == 1] <- rmse(mdl_svm$fitted, train$p_recid_raw_noage) # ADJUST GROUP

Cleanup

# Remove every per-group object, including the SVM fit and the xgboost
# prediction vectors, so nothing from this group can be picked up by accident
# while the next group is being fitted.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm, pred, actual)

Group 2 models: predicting (raw - age polynomial) without using age variables but with race

### Create group 2 training data

## Select features and cap each count feature at a fixed maximum
train <- features_filt %>%
  transmute(
    # p_current_age is left out of this group
    p_age_first_offense,
    p_juv_fel_count        = pmin(p_juv_fel_count, 2),
    p_felprop_violarrest   = pmin(p_felprop_violarrest, 5),
    p_murder_arrest        = pmin(p_murder_arrest, 3),
    p_felassault_arrest    = pmin(p_felassault_arrest, 3),
    p_misdemassault_arrest = pmin(p_misdemassault_arrest, 3),
    # p_famviol_arrest (capped at 3) is left out as well
    p_sex_arrest           = pmin(p_sex_arrest, 3),
    p_weapons_arrest       = pmin(p_weapons_arrest, 3),
    p_n_on_probation       = pmin(p_n_on_probation, 5),
    p_current_on_probation = pmin(p_current_on_probation, 5),
    p_prob_revoke          = pmin(p_prob_revoke, 5),
    # 0/1 race indicators; race == "Other" is the baseline
    race_black    = if_else(race == "African-American", true = 1, false = 0),
    race_white    = if_else(race == "Caucasian", true = 1, false = 0),
    race_hispanic = if_else(race == "Hispanic", true = 1, false = 0),
    race_asian    = if_else(race == "Asian", true = 1, false = 0),
    race_native   = if_else(race == "Native American", true = 1, false = 0),
    p_recid_raw_noage # regression target
  )


## Format for xgboost
# Feature matrix is every column except the target; the label is the target.
train_xgb <- xgb.DMatrix(
  data = as.matrix(select(train, -p_recid_raw_noage)),
  label = as.matrix(select(train, p_recid_raw_noage))
)

Model 1: Linear model

# Ordinary least squares on every feature column in `train`.
mdl_lm <- lm(formula = p_recid_raw_noage ~ ., data = train)
summary(mdl_lm)
## 
## Call:
## lm(formula = p_recid_raw_noage ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9759 -0.3321 -0.1098  0.2460  3.6383 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.6137104  0.0260561  23.553  < 2e-16 ***
## p_age_first_offense    -0.0044461  0.0004968  -8.949  < 2e-16 ***
## p_juv_fel_count         0.1212941  0.0192844   6.290 3.33e-10 ***
## p_felprop_violarrest    0.1088356  0.0060435  18.009  < 2e-16 ***
## p_murder_arrest         0.1060919  0.0473390   2.241    0.025 *  
## p_felassault_arrest     0.1767698  0.0112734  15.680  < 2e-16 ***
## p_misdemassault_arrest  0.1668764  0.0105170  15.867  < 2e-16 ***
## p_sex_arrest            0.1659144  0.0400788   4.140 3.51e-05 ***
## p_weapons_arrest        0.0929469  0.0155661   5.971 2.45e-09 ***
## p_n_on_probation        0.0736827  0.0050787  14.508  < 2e-16 ***
## p_current_on_probation  0.1424988  0.0256041   5.565 2.69e-08 ***
## p_prob_revoke           0.1351758  0.0191733   7.050 1.92e-12 ***
## race_black              0.2747839  0.0225159  12.204  < 2e-16 ***
## race_white              0.1209223  0.0227808   5.308 1.13e-07 ***
## race_hispanic           0.0267705  0.0273158   0.980    0.327    
## race_asian             -0.0244794  0.0753349  -0.325    0.745    
## race_native             0.1538104  0.0962585   1.598    0.110    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4958 on 9025 degrees of freedom
## Multiple R-squared:  0.3442, Adjusted R-squared:  0.3431 
## F-statistic: 296.1 on 16 and 9025 DF,  p-value: < 2.2e-16
# Training-set RMSE of the linear model, stored in the Group 2 row.
res_rmse$lm[res_rmse$Group == 2] <- rmse(predict(mdl_lm, newdata = train), train$p_recid_raw_noage) # ADJUST GROUP

Model 2: xgboost

# Fix the RNG seed, then fit over the parameter combinations in `param`.
set.seed(390)
mdl_xgb <- fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  14          
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.1"       
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "10"        
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
# Predictions on the training matrix against the COMPAS remainder.
pred <- predict(mdl_xgb, newdata = train_xgb)
actual <- train$p_recid_raw_noage

res_rmse$xgb[res_rmse$Group == 2] <- rmse(pred, actual) # ADJUST GROUP

# Both axes share one range so the identity line is meaningful.
axis_min <- min(pred, actual)
axis_max <- max(pred, actual)

ggplot(data.frame(xgboost = pred, compas = actual)) +
  geom_point(aes(x = compas, y = xgboost), alpha = 0.3) +
  geom_abline(slope = 1, color = "red") +
  xlim(axis_min, axis_max) +
  ylim(axis_min, axis_max) +
  coord_fixed() +
  theme_bw() +
  xlab("COMPAS violence raw remainder") +
  ylab("XGBoost prediction") +
  theme(
    text = element_text(size = 14),
    axis.text = element_text(size = 14)
  )

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

# Fix the RNG seed, then fit a random forest with the package defaults.
set.seed(2728)

mdl_rf <- randomForest(formula = p_recid_raw_noage ~ ., data = train)

# NOTE(review): the randomForest docs describe `$predicted` as out-of-bag
# predictions, whereas the other models are scored on their training fit --
# confirm the RMSE columns are meant to be compared directly.
res_rmse$rf[res_rmse$Group == 2] <- rmse(mdl_rf$predicted, train$p_recid_raw_noage) # ADJUST GROUP

Model 4: SVM

# Fit the SVM over the parameter combinations in `param_svm`.
mdl_svm <- fit_svm(p_recid_raw_noage ~ ., train, param_svm)
## Training on 27 sets of parameters.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
## [1] "Best parameters:"
##             10              
## type        "eps-regression"
## cost        "0.5"           
## epsilon     "0.5"           
## gamma_scale "1"             
## gamma       "0.05882353"
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
# RMSE of the SVM's fitted values, stored in the Group 2 row.
res_rmse$svm[res_rmse$Group == 2] <- rmse(mdl_svm$fitted, train$p_recid_raw_noage) # ADJUST GROUP

Cleanup

# Remove every per-group object, including the SVM fit and the xgboost
# prediction vectors, so nothing from this group can be picked up by accident
# while the next group is being fitted.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm, pred, actual)

Group 3 models: predicting (raw - age polynomial) without using race but with age variables

### Create group 3 training data

## Select features and cap each count feature at a fixed maximum
train <- features_filt %>%
  transmute(
    p_current_age, # current age is included in this group
    p_age_first_offense,
    p_juv_fel_count        = pmin(p_juv_fel_count, 2),
    p_felprop_violarrest   = pmin(p_felprop_violarrest, 5),
    p_murder_arrest        = pmin(p_murder_arrest, 3),
    p_felassault_arrest    = pmin(p_felassault_arrest, 3),
    p_misdemassault_arrest = pmin(p_misdemassault_arrest, 3),
    # p_famviol_arrest (capped at 3) is left out
    p_sex_arrest           = pmin(p_sex_arrest, 3),
    p_weapons_arrest       = pmin(p_weapons_arrest, 3),
    p_n_on_probation       = pmin(p_n_on_probation, 5),
    p_current_on_probation = pmin(p_current_on_probation, 5),
    p_prob_revoke          = pmin(p_prob_revoke, 5),
    p_recid_raw_noage # regression target
  )


## Format for xgboost
# Feature matrix is every column except the target; the label is the target.
train_xgb <- xgb.DMatrix(
  data = as.matrix(select(train, -p_recid_raw_noage)),
  label = as.matrix(select(train, p_recid_raw_noage))
)

Model 1: Linear model

# Ordinary least squares on every feature column in `train`.
mdl_lm <- lm(formula = p_recid_raw_noage ~ ., data = train)
summary(mdl_lm)
## 
## Call:
## lm(formula = p_recid_raw_noage ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8853 -0.3396 -0.1105  0.2422  3.8069 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.7750803  0.0158841  48.796  < 2e-16 ***
## p_current_age           0.0140701  0.0009606  14.646  < 2e-16 ***
## p_age_first_offense    -0.0195380  0.0010109 -19.327  < 2e-16 ***
## p_juv_fel_count         0.1216902  0.0193879   6.277 3.62e-10 ***
## p_felprop_violarrest    0.0902790  0.0062621  14.417  < 2e-16 ***
## p_murder_arrest         0.0807082  0.0476452   1.694  0.09031 .  
## p_felassault_arrest     0.1480263  0.0115601  12.805  < 2e-16 ***
## p_misdemassault_arrest  0.1516864  0.0106318  14.267  < 2e-16 ***
## p_sex_arrest            0.1227168  0.0404793   3.032  0.00244 ** 
## p_weapons_arrest        0.0757266  0.0157582   4.806 1.57e-06 ***
## p_n_on_probation        0.0694142  0.0051413  13.501  < 2e-16 ***
## p_current_on_probation  0.1673132  0.0257716   6.492 8.91e-11 ***
## p_prob_revoke           0.0562168  0.0201996   2.783  0.00540 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4989 on 9029 degrees of freedom
## Multiple R-squared:  0.3358, Adjusted R-squared:  0.3349 
## F-statistic: 380.3 on 12 and 9029 DF,  p-value: < 2.2e-16
# Training-set RMSE of the linear model, stored in the Group 3 row.
res_rmse$lm[res_rmse$Group == 3] <- rmse(predict(mdl_lm, newdata = train), train$p_recid_raw_noage) # ADJUST GROUP

Model 2: xgboost

# Fix the RNG seed, then fit over the parameter combinations in `param`.
set.seed(34)
mdl_xgb <- fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  5           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.05"      
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot
# Predictions on the training matrix against the COMPAS remainder.
pred <- predict(mdl_xgb, newdata = train_xgb)
actual <- train$p_recid_raw_noage

res_rmse$xgb[res_rmse$Group == 3] <- rmse(pred, actual) # ADJUST GROUP

# Both axes share one range so the identity line is meaningful.
axis_min <- min(pred, actual)
axis_max <- max(pred, actual)

ggplot(data.frame(xgboost = pred, compas = actual)) +
  geom_point(aes(x = compas, y = xgboost), alpha = 0.3) +
  geom_abline(slope = 1, color = "red") +
  xlim(axis_min, axis_max) +
  ylim(axis_min, axis_max) +
  coord_fixed() +
  theme_bw() +
  xlab("COMPAS violence raw remainder") +
  ylab("XGBoost prediction") +
  theme(
    text = element_text(size = 14),
    axis.text = element_text(size = 14)
  )

### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))

Model 3: random forest

# Fix the RNG seed, then fit a random forest with the package defaults.
set.seed(7872)

mdl_rf <- randomForest(formula = p_recid_raw_noage ~ ., data = train)

# NOTE(review): the randomForest docs describe `$predicted` as out-of-bag
# predictions, whereas the other models are scored on their training fit --
# confirm the RMSE columns are meant to be compared directly.
res_rmse$rf[res_rmse$Group == 3] <- rmse(mdl_rf$predicted, train$p_recid_raw_noage) # ADJUST GROUP

Model 4: SVM

# Fit the SVM over the parameter combinations in `param_svm`.
mdl_svm <- fit_svm(p_recid_raw_noage ~ ., train, param_svm)
## Training on 27 sets of parameters.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
## [1] "Best parameters:"
##             1               
## type        "eps-regression"
## cost        "0.5"           
## epsilon     "0.5"           
## gamma_scale "0.5"           
## gamma       "0.03846154"
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
## Store the group 3 SVM error: rmse() of the SVM's fitted values against the training target.
## (The trailing ADJUST GROUP tag presumably flags the group number to edit when this line is copied.)
res_rmse[res_rmse$Group==3,]$svm = rmse(mdl_svm$fitted, train$p_recid_raw_noage) # ADJUST GROUP

Cleanup

## Drop the training data and every fitted model, including the SVM, before the
## next group is built. Leaving mdl_svm behind would let a stale fit be scored
## silently if the next group's fit_svm() call failed or was skipped.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)

Group 4 models: predicting (raw - age polynomial) using age variables and race (alongside the criminal-history counts)

### Create group 4 training data

## Build the modelling table from features_filt: the two age variables, the
## criminal-history counts capped from above with pmin(), 0/1 race indicators,
## and the regression target p_recid_raw_noage as the last column.
train = transmute(
  features_filt,
  p_current_age,
  p_age_first_offense,
  ## Counts are capped so large values share the top bucket
  p_juv_fel_count        = pmin(p_juv_fel_count, 2),
  p_felprop_violarrest   = pmin(p_felprop_violarrest, 5),
  p_murder_arrest        = pmin(p_murder_arrest, 3),
  p_felassault_arrest    = pmin(p_felassault_arrest, 3),
  p_misdemassault_arrest = pmin(p_misdemassault_arrest, 3),
  #p_famviol_arrest      = pmin(p_famviol_arrest, 3),
  p_sex_arrest           = pmin(p_sex_arrest, 3),
  p_weapons_arrest       = pmin(p_weapons_arrest, 3),
  p_n_on_probation       = pmin(p_n_on_probation, 5),
  p_current_on_probation = pmin(p_current_on_probation, 5),
  p_prob_revoke          = pmin(p_prob_revoke, 5),
  ## Race indicators; race == "Other" gets no column and is the baseline
  race_black    = as.numeric(race == "African-American"),
  race_white    = as.numeric(race == "Caucasian"),
  race_hispanic = as.numeric(race == "Hispanic"),
  race_asian    = as.numeric(race == "Asian"),
  race_native   = as.numeric(race == "Native American"),
  p_recid_raw_noage
)

## Wrap the same table for xgboost: every column except the target becomes the
## feature matrix, and the target column alone becomes the label.
train_xgb = xgb.DMatrix(
  data  = as.matrix(select(train, -p_recid_raw_noage)),
  label = as.matrix(select(train, p_recid_raw_noage))
)

Model 1: Linear model

## Linear regression of the target on every other column of train,
## followed by the coefficient table
mdl_lm = lm(formula = p_recid_raw_noage ~ ., data = train)
summary(mdl_lm)
## 
## Call:
## lm(formula = p_recid_raw_noage ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9064 -0.3264 -0.1039  0.2364  3.6516 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.5544869  0.0260945  21.249  < 2e-16 ***
## p_current_age           0.0135686  0.0009462  14.340  < 2e-16 ***
## p_age_first_offense    -0.0169983  0.0010038 -16.934  < 2e-16 ***
## p_juv_fel_count         0.1127517  0.0190788   5.910 3.55e-09 ***
## p_felprop_violarrest    0.0876690  0.0061557  14.242  < 2e-16 ***
## p_murder_arrest         0.0839132  0.0468369   1.792  0.07323 .  
## p_felassault_arrest     0.1451296  0.0113640  12.771  < 2e-16 ***
## p_misdemassault_arrest  0.1521143  0.0104506  14.556  < 2e-16 ***
## p_sex_arrest            0.1160088  0.0397845   2.916  0.00356 ** 
## p_weapons_arrest        0.0663196  0.0155042   4.278 1.91e-05 ***
## p_n_on_probation        0.0647496  0.0050606  12.795  < 2e-16 ***
## p_current_on_probation  0.1570690  0.0253391   6.199 5.94e-10 ***
## p_prob_revoke           0.0506041  0.0198557   2.549  0.01083 *  
## race_black              0.2658235  0.0222736  11.934  < 2e-16 ***
## race_white              0.1089137  0.0225424   4.831 1.38e-06 ***
## race_hispanic           0.0308834  0.0270128   1.143  0.25295    
## race_asian             -0.0139690  0.0744987  -0.188  0.85127    
## race_native             0.1331917  0.0951964   1.399  0.16181    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4903 on 9024 degrees of freedom
## Multiple R-squared:  0.3589, Adjusted R-squared:  0.3576 
## F-statistic: 297.1 on 17 and 9024 DF,  p-value: < 2.2e-16
## Store the group 4 linear-model error: rmse() of predictions made on train itself against the target
res_rmse[res_rmse$Group==4,]$lm = rmse(predict(mdl_lm, newdata=train), train$p_recid_raw_noage) # ADJUST GROUP

Model 2: xgboost

## Seed the RNG immediately before the fit; the seed must stay directly ahead of the call.
set.seed(11)
## fit_xgboost() and param are defined outside this section; judging by the log below,
## the helper trains on 16 parameter sets and prints the chosen one.
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
##                  6           
## objective        "reg:linear"
## eval_metric      "rmse"      
## eta              "0.1"       
## gamma            "0.5"       
## max_depth        "5"         
## min_child_weight "5"         
## subsample        "1"         
## colsample_bytree "1"
### xgboost plot: predictions on the training matrix against the COMPAS remainder
actual = train$p_recid_raw_noage
pred = predict(mdl_xgb, newdata = train_xgb)

res_rmse[res_rmse$Group==4,]$xgb = rmse(pred, actual) # ADJUST GROUP

## One pair of limits for both axes so the red y = x line is the diagonal
axis_min = min(pred, actual)
axis_max = max(pred, actual)

ggplot(data = data.frame(xgboost = pred, compas = actual)) +
  geom_point(mapping = aes(x = compas, y = xgboost), alpha = .3) +
  geom_abline(slope = 1, color = "red") +
  xlim(axis_min, axis_max) +
  ylim(axis_min, axis_max) +
  coord_fixed() +
  theme_bw() +
  labs(
    x = "COMPAS violence raw remainder",
    y = "Prediction of COMPAS violence raw remainder"
  ) +
  theme(
    text = element_text(size = 12),
    axis.text = element_text(size = 12)
  )

## No plot argument is given, so ggsave() writes the plot displayed last
ggsave(
  "Figures/rawScoreRemain_xgboost_violent.pdf",
  width = 4,
  height = 4,
  units = "in"
)
### Variable importance: compute the importance table of the xgboost fit and plot it
mdl_xgb %>%
  xgb.importance(model = .) %>%
  xgb.plot.importance(importance_matrix = .)

## Rows to single out in the figure, keyed by (person_id, screening_date)
highlight = data.frame(
  person_id = c(
    799, 1284, 1394, 1497, 1515, 1638, 3145, 3291, 5722, 6337,
    6886, 7997, 8200, 8375, 8491, 10553, 10774, 11231, 11312, 11414
  ),
  screening_date = ymd(c(
    "2014-06-15", "2014-05-14", "2014-11-28", "2013-07-29", "2013-10-23",
    "2013-10-04", "2014-12-14", "2013-01-17", "2013-10-24", "2014-02-04",
    "2013-07-12", "2014-04-26", "2014-05-05", "2013-03-19", "2014-01-18",
    "2014-09-20", "2013-04-09", "2014-02-23", "2014-05-02", "2014-11-26"
  )),
  highlight = TRUE
)

## Attach the xgboost prediction to every row of features_filt and flag the rows
## that matched the lookup above; unmatched rows come back from the join as NA.
df_plot = features_filt %>%
  bind_cols(xgboost = predict(mdl_xgb, newdata = train_xgb)) %>%
  left_join(highlight, by = c("person_id", "screening_date")) %>%
  mutate(
    highlight = factor(
      if_else(is.na(highlight), "Not in Table 5", "In Table 5"),
      levels = c("In Table 5", "Not in Table 5")
    )
  )

## Which highlighted people get a name label, grouped by the side of the point
## the label is placed on
person_id_text_topright = c(8491, 8375, 1497)
#person_id_text_topright = highlight$person_id
person_id_text_topleft = c(5722, 11231)
person_id_text_botright = c(799, 11312, 1284, 11414)
person_id_text_botleft = c()

ggplot() +
  ## Background points first, highlighted points drawn on top at full opacity
  geom_point(
    aes(x = xgboost, y = p_recid_raw, color = highlight),
    alpha = .3,
    data = filter(df_plot, highlight == "Not in Table 5")
  ) +
  geom_point(
    aes(x = xgboost, y = p_recid_raw, color = highlight),
    data = filter(df_plot, highlight == "In Table 5")
  ) +
  theme_bw() +
  ## One text layer per placement: top-right, top-left, bottom-right, bottom-left
  Map(
    function(label_ids, label_hjust, label_vjust) {
      geom_text(
        aes(x = xgboost, y = p_recid_raw, label = name),
        size = 3,
        nudge_x = 0,
        nudge_y = 0,
        hjust = label_hjust,
        vjust = label_vjust,
        data = filter(df_plot, person_id %in% label_ids & highlight == "In Table 5")
      )
    },
    list(
      person_id_text_topright,
      person_id_text_topleft,
      person_id_text_botright,
      person_id_text_botleft
    ),
    c("left", "right", "left", "right"),
    c("bottom", "bottom", "top", "top")
  ) +
  labs(
    x = "Prediction of COMPAS violence raw remainder",
    y = "COMPAS violence raw"
  ) +
  theme(
    text = element_text(size = 12),
    axis.text = element_text(size = 12),
    #legend.position = "top",
    legend.position = "none"
  ) +
  scale_color_discrete(name = element_blank())

## No plot argument is given, so ggsave() writes the plot displayed last
ggsave(
  "Figures/xgboost_rawScore_violent.pdf",
  width = 4,
  height = 4,
  units = "in"
)

Model 3: random forest

## Seed the RNG immediately before the fit
set.seed(379)

## Random forest regression of the target on every other column of train;
## no tuning arguments are passed, so randomForest() runs with its defaults.
mdl_rf = randomForest(
  formula = p_recid_raw_noage ~ .,
  data = train
)

## NOTE(review): per the randomForest documentation, $predicted holds out-of-bag
## predictions, whereas the lm and xgboost entries for this group come from predicting
## on the training data itself -- confirm the columns are meant to be compared directly.
res_rmse[res_rmse$Group==4,]$rf = rmse(mdl_rf$predicted, train$p_recid_raw_noage) # ADJUST GROUP

Model 4: SVM

## Fit the SVM via fit_svm() with the param_svm settings (both defined outside this section);
## the log below shows it training on 27 parameter sets and printing the best one.
## NOTE(review): unlike the xgboost and random forest fits, no set.seed() precedes this
## call -- confirm fit_svm() is deterministic.
mdl_svm = fit_svm(p_recid_raw_noage ~ ., train, param_svm)
## Training on 27 sets of parameters.
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.

## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
## [1] "Best parameters:"
##             2               
## type        "eps-regression"
## cost        "1"             
## epsilon     "0.5"           
## gamma_scale "0.5"           
## gamma       "0.02777778"
## Warning in cret$cresults * scale.factor: Recycling array of length 1 in vector-array arithmetic is deprecated.
##   Use c() or as.vector() instead.
## Store the group 4 SVM error: rmse() of the SVM's fitted values against the training target
res_rmse[res_rmse$Group==4,]$svm = rmse(mdl_svm$fitted, train$p_recid_raw_noage) # ADJUST GROUP

Cleanup

## Remove the group 4 training data and the lm / xgboost / random forest fits.
## NOTE(review): mdl_svm is not in this list and stays in the workspace -- confirm that is intended.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf)

Comparison

## Render the collected RMSE table: one row per feature group, one column per model
knitr::kable(res_rmse)
Group lm xgb rf svm
1 0.5044421 0.4831389 0.4951366 0.4816624
2 0.4953577 0.4699155 0.4879687 0.4792197
3 0.4985543 0.4682046 0.4923167 0.4865796
4 0.4898085 0.4565221 0.4833362 0.4759958